In [1]:
import graphlab as gl
In [2]:
# Load the labeled training set: (search_term, product) pairs with a
# human-judged relevance score.
train_csv_path = "../data/train.csv"
train = gl.SFrame.read_csv(train_csv_path)
In [3]:
# Load the unlabeled test set we must predict relevance for.
test_csv_path = "../data/test.csv"
test = gl.SFrame.read_csv(test_csv_path)
In [4]:
# Load the free-text product descriptions (one row per product_uid).
desc_csv_path = "../data/product_descriptions.csv"
desc = gl.SFrame.read_csv(desc_csv_path)
In [5]:
# Load the long-format product attributes (product_uid, name, value rows).
attr_csv_path = "../data/attributes.csv"
attr = gl.SFrame.read_csv(attr_csv_path)
In [6]:
# Attach each product's description to the training rows; a left join keeps
# every training row even when no description exists for the product.
train = train.join(desc, how='left', on='product_uid')
In [7]:
# Same enrichment for the test rows: left-join the product descriptions so
# no test row is dropped.
test = test.join(desc, how='left', on='product_uid')
In [8]:
# if some attributes has no so we don't need them
print len(attr)
attr = attr[attr['value'] != "No"]
print len(attr)
In [9]:
# A "Yes" value only confirms the attribute applies; replace it with the
# attribute's name so the feature text itself becomes searchable.
attr['value'] = attr.apply(lambda row: row['name'] if row['value'] == "Yes" else row['value'])
Let's select brands
In [10]:
# Keep only the manufacturer-brand attribute rows.
is_brand_row = attr['name'] == "MFG Brand Name"
brands = attr[is_brand_row]
In [11]:
# Sanity check: preview the first few brand rows.
brands.head()
Out[11]:
Bullets too
In [12]:
# Select the "BulletNN" attribute rows (per-product marketing bullet points).
# NOTE(review): relies on SArray.contains doing a per-element substring
# match — confirm against the installed GraphLab Create version.
bullets = attr[attr['name'].contains("Bullet")]
In [13]:
# Pivot the long (product_uid, name, value) bullet rows into one row per
# product: unstack packs each product's bullets into a single dict column,
# unpack then spreads that dict into one column per bullet name
# (e.g. bullets.Bullet01), and the result is sorted by product id.
bullets = bullets.unstack(column = ['name', 'value'], new_column_name = "bullets")
bullets = bullets.unpack("bullets")
bullets = bullets.sort("product_uid")
print len(bullets)
In [14]:
# Enrich training rows with the brand name (arrives as the 'value' column)
# and the bullet columns; left joins keep rows for products lacking either.
train = train.join(brands, how='left', on='product_uid')
train = train.join(bullets, how='left', on='product_uid')
In [15]:
# Apply the identical brand + bullet enrichment to the test rows.
test = test.join(brands, how='left', on='product_uid')
test = test.join(bullets, how='left', on='product_uid')
In [16]:
def calculateTfIdf(cols, data, searchColTfIdfName):
    """Add word-count, TF-IDF and search-distance columns for each text column.

    For every column <c> in `cols` this adds to `data`:
      - <c>_word_count : bag-of-words dict (gl.text_analytics.count_words)
      - <c>_tfidf      : TF-IDF dict computed from those word counts
      - <c>_distance   : cosine distance between the row's search-term TF-IDF
                         and <c>_tfidf (not added for the search column itself)

    cols               -- list of text column names present in `data`
    data               -- gl.SFrame; mutated in place and also returned
    searchColTfIdfName -- name of the search term's TF-IDF column (e.g.
                          'search_term_tfidf'); its source column should come
                          first in `cols` so the column exists before any
                          distance is computed

    Returns the mutated `data`.
    """
    for colName in cols:  # iterate the names directly instead of xrange indexing
        wordCountCol = colName + "_word_count"
        tfidfCol = colName + "_tfidf"
        distanceCol = colName + "_distance"
        data[wordCountCol] = gl.text_analytics.count_words(data[colName])
        data[tfidfCol] = gl.text_analytics.tf_idf(data[wordCountCol])
        # Bug fix: the original compared searchColTfIdfName (a *_tfidf name)
        # against the raw column name, which never matches, so it also built a
        # useless all-zero self-distance column for the search term.
        if searchColTfIdfName != tfidfCol:
            # Bug fix: a missing/empty TF-IDF used to yield distance 0
            # (cosine distance 0 means "identical"); use 1.0 (maximally
            # distant) instead, and guard the search side as well.
            data[distanceCol] = data.apply(
                lambda x, t=tfidfCol, s=searchColTfIdfName:
                    1.0 if (not x[t] or not x[s])
                    else gl.distances.cosine(x[s], x[t]))
    return data
In [17]:
# columns = ['search_term', 'product_title', 'product_description', 'value', 'bullets.Bullet01',
# 'bullets.Bullet02', 'bullets.Bullet03', 'bullets.Bullet04', 'bullets.Bullet05', 'bullets.Bullet06'
# , 'bullets.Bullet07', 'bullets.Bullet08', 'bullets.Bullet09', 'bullets.Bullet10', 'bullets.Bullet11'
# , 'bullets.Bullet12', 'bullets.Bullet13', 'bullets.Bullet14', 'bullets.Bullet15', 'bullets.Bullet16'
# , 'bullets.Bullet17', 'bullets.Bullet18', 'bullets.Bullet19', 'bullets.Bullet20', 'bullets.Bullet21'
# , 'bullets.Bullet22']
# Text columns to featurize with TF-IDF ('value' holds the brand name
# after the attribute join).
columns = [
    'search_term',
    'product_title',
    'product_description',
    'value',
]
train = calculateTfIdf(columns, train, 'search_term_tfidf')
In [18]:
# Apply the identical TF-IDF / distance featurization to the test frame.
test = calculateTfIdf(columns, test, 'search_term_tfidf')
In [19]:
featuresDistance = [s for s in train.column_names() if "distance" in s]
print featuresDistance
In [20]:
#train = train.dropna('value_distance')
In [21]:
# Fit a baseline linear regression of relevance on the distance features.
# NOTE(review): create() holds out a random validation split by default, so
# results can vary between runs — confirm/seed if reproducibility matters.
model1 = gl.linear_regression.create(train, target = 'relevance', features = featuresDistance)
In [22]:
# Inspect the learned weights: one coefficient per distance feature.
model1.get("coefficients")
Out[22]:
In [23]:
# Disabled draft: RSS against test['relevance'] — the Kaggle test set has
# no relevance labels, so this would fail if ever enabled.
'''
predictions_test = model1.predict(test)
test_errors = predictions_test - test['relevance']
RSS_test = sum(test_errors * test_errors)
print RSS_test
'''
Out[23]:
In [24]:
# Score the test rows with the fitted model and preview the predictions.
predictions_test = model1.predict(test)
predictions_test
Out[24]:
In [25]:
# Seed the submission frame with the test ids (the column gets an
# auto-generated name at this point).
submission = gl.SFrame(test['id'])
In [26]:
# Append the predictions and rename both columns to the Kaggle schema.
# NOTE(review): assumes add_column mutates in place and the auto-generated
# column names are 'X1'/'X2' — verify against the installed GraphLab version.
submission.add_column(predictions_test)
submission.rename({'X1': 'id', 'X2':'relevance'})
Out[26]:
In [27]:
# Clamp predictions into the valid relevance range [1.0, 3.0] in one pass
# (equivalent to capping at 3.0 then flooring at 1.0).
submission['relevance'] = submission.apply(lambda row: min(3.0, max(1.0, row['relevance'])))
In [28]:
# Convert the scores to strings so the CSV writer emits them verbatim.
submission['relevance'] = submission.apply(lambda row: str(row['relevance']))
In [29]:
# Write the submission file; quote_level=3 corresponds to csv.QUOTE_NONE,
# so fields are written unquoted (safe here: ids and stringified floats
# contain no commas).
submission.export_csv('../data/submission.csv', quote_level = 3)
In [ ]:
#gl.canvas.set_target('ipynb')